#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
# Feb 17 2024
# Replication script for *training classifier* in:
# Electoral Systems And The Autocrat’s Trade-Off: Evidence From The Russian Duma
# World Politics, vol. 76, no. 4, Oct 2024
#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*

rm(list = ls(all = TRUE))
setwd(dirname(rstudioapi::getSourceEditorContext()$path)) # 


#################################################
##### READING IN PACKAGES
#################################################
pack <- c('parallel', 'jsonlite', 'httr', 'caret', 'rdd', 'rdrobust', 'xlsx', 'Rfast',"readxl", "sqldf", "lubridate", "quanteda","rvest", "randomForest", "SnowballC", "tidytext", "topicmodels", "RTextTools", "tm", "zoo", "gdata", "readr", "purrr","stringdist" , "stringi", "data.table", "jsonlite", "XML", "httr", "ggrepel", "classInt", "MASS", "tile", "simcf","sp" ,"plyr", "dplyr", "stringr", "ggplot2", "lfe", "stargazer")
lapply(pack, require, character.only=T); rm(pack)

#################################################################################################
# TRAINING CLASSIFIER
# DATA MANIPULATION AND RANDOM FOREST ESTIMATION
#################################################################################################
# https://stats.stackexchange.com/questions/175236/why-does-randomforest-confusion-matrix-not-match-the-one-i-calculate-using-predi
# https://stats.stackexchange.com/questions/30691/how-to-interpret-oob-and-confusion-matrix-for-random-forest
# http://blog.keyrus.co.uk/alteryxs_r_random_forest_output_explained.html
# https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/tutorial-random-forest-parameter-tuning-r/tutorial/

# source("~/lib/RCE TRAIN.R") # FOR THE HARVARD RCE

#texts <- readxl::read_excel("TrainingSet_final v 07112018.xlsx")[1:4000, ]
texts <- read.csv("TrainingSet_final v 07112018.csv")[1:4000, ]
texts$X__1 <- seq(1, nrow(texts), 1)
texts$dist_preference[is.na(texts$dist_preference)] <- 0
document <- as.data.frame(texts[c("X__1", "utterance")])
names(document) <- c("doc_id", "text")
corpus <- Corpus(DataframeSource(document))

# DATA PREPROCESSING: TEXT ==> DATA
# - - - - - - - - - - -- - - - - - - - - - - -- - - - - - - - - - - - - - - - 
corpus <- tm_map(corpus, content_transformer(tolower)) 
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers) 
corpus <- tm_map(corpus, removeWords, stopwords("russian")) 
corpus <- tm_map(corpus, stemDocument, language="russian") 
corpus <- tm_map(corpus, stripWhitespace) 
corpus <- quanteda::corpus(corpus) #-# Has to used quanteda's dfm() function, therefore conversion.
dtm <- dfm(corpus)

dta <- convert(dtm, to="data.frame")
dta<- dta[,-1]
dta$length <- apply(dta, 1, sum)
dta$Y <- as.factor(texts$dist_preference) # 11.5 pct. = RegDistPref
# fwrite(dta, file="train_ready.csv")
# dta <- fread("train_ready.csv", encoding='UTF-8')


#################################################################################################
# SCRIPT FOR FITTING MODEL IN HARVARD RCE
#################################################################################################

# setwd("~/Dropbox/Papers/Talk of the Home Town")
setwd("~/lib") # for HARVARD RCE
pack <- c("caret", "SnowballC","doParallel", "randomForest", "foreach", "data.table")
lapply(pack,  require, character.only=T); rm(pack)
dta <- fread("train_ready.csv")

grid <- expand.grid(mtry = c(10, 50, 75, 100, 250, 500, 1000),
                    nodesize = c(5, 20, 50, 75, 100, 200, 500), 
                    cutoff=list(c(0.8,0.2), c(.85, .15), c(0.9,0.1)))

grid <- expand.grid(mtry = c(900, 950, 975, 1000, 1050, 1100, 1300),
                    nodesize = c(2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55), 
                    cutoff=list(c(0.8,0.2), c(0.825, 0.175), c(.85, .15), c(0.875, 0.125), c(0.9,0.1), c(0.925, 0.075)))

grid <- expand.grid(mtry = c(1300, 1700, 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 15000),
                    nodesize = c(10, 15, 20, 25, 30, 35), 
                    cutoff=list(c(0.78,0.22), c(0.8,0.2), c(0.82,0.18), c(0.84, 0.16), c(0.86, 0.14), c(0.88, 0.12), c(0.9,0.1), c(0.92,0.08), c(0.94,0.06)))

grid <- expand.grid(mtry = c(1600, 2250, 2750, 3250, 4500, 10000),
                    nodesize = c(10, 15, 20, 25, 30, 35), 
                    cutoff=list(c(0.78,0.22), c(0.8,0.2), c(0.85, 0.15), c(0.9,0.1), c(0.94,0.06)))


All_models <- list() 

#################################################################################################
# TRAINING - - - - - - - - - - - OUT-OF-BAG ERROR ONLY
#################################################################################################
cores <- round(detectCores() / 1.5, 0) # THOUGH MORE CORES ARE AVAILABLE, FOREACH IS VERY MEMORY DEMANDING
cluster <- makeCluster(cores)
registerDoParallel(cluster)

for (i in 1:nrow(grid)){
  {foreach_forest <- foreach(ntree=rep(round((60/cores), 0), cores), .combine=combine, .multicombine=TRUE, .packages="randomForest") %dopar%
    randomForest(y=as.factor(dta$Y), x=dta[, -ncol(dta)], ntree=ntree, localImp = FALSE, nodesize = grid$nodesize[i], mtry = grid$mtry[i], do.trace = TRUE, cutoff = grid$cutoff[[i]]) }
  All_models[[i]] <- foreach_forest
  if(i %in% c(1, seq(0, nrow(grid), 10))){print(i)}}

stopCluster(cluster)
registerDoSEQ()

names(All_models) <- apply(grid, 1, function(x){paste(x, collapse=" - ")}) # NAMING THE MODELS IN THE LIST

# SAVING (/LOADING) ALL FITTED MODELS
max <- max(as.numeric(str_extract(list.files, '[[:digit:]]')), na.rm=T)
if (all(is.na(max))){name <- 'all_models1.R'}else{name <- paste('all_models', max+1, '.R', sep="")}
saveRDS(All_models, paste(getwd(), '/', name, sep=""))

#################################################################################################
## DETERMINING BEST MODEL (OUT-OF-BAG ERROR ESTIMATES & F1 SCORE) (ACCURACY PARADOX)
#################################################################################################
# https://gis.stackexchange.com/questions/198404/random-forest-out-of-the-bag-and-confusion-matrix
# https://machinelearningmastery.com/classification-accuracy-is-not-enough-more-performance-measures-you-can-use/
# https://medium.com/usf-msds/choosing-the-right-metric-for-evaluating-machine-learning-models-part-2-86d5649a5428

precision.score <- function(x){x$table[2, 2] / (x$table[2, 2] + x$table[2, 1])}
recall.score <- function(x){x$table[2, 2] / (x$table[2, 2] + x$table[1, 2])}
setwd("/Users/andersnielsen/Dropbox/Papers/Talk of the Home TOwn")
setwd('C:/Users/xwg574/Dropbox/Papers/Talk of the Home Town/Training Set - classify speeches')
setwd('~/Dropbox/Papers/Talk of the Home Town/Training Set - classify speeches')
dtm <- fread('train_ready.csv', encoding = 'UTF-8')

setwd("/Users/andersnielsen/Dropbox/Papers/Talk of the Home TOwn/models")
setwd('C:/Users/xwg574/Dropbox/Papers/Talk of the Home Town/models')
setwd('~/Dropbox/Papers/Talk of the Home Town/models')

model.name <- list.files()[grepl(".R",list.files())]
All_models <- list()

for (i in 1:length(model.name)){
  loop <- readRDS(model.name[i])
  All_models <- c(All_models, loop)
}; rm(loop)

All_models[unlist(lapply(All_models, function(x){class(x)!="randomForest"}))] <- NULL
conf.overview <- lapply(All_models, function(x){caret::confusionMatrix(predict(x), as.factor(dtm$Y))})
names(conf.overview) <- paste(names(All_models), 'index = ', seq(1:length(All_models)), sep=' ')

#########
# F1 = 2 * ((Precision * Recall) / (Precision + Recall))
#########
F1 <- lapply(conf.overview, function(x){( 2*((precision.score(x)*recall.score(x))/(precision.score(x)+recall.score(x))))})
F1 <- stack(F1) %>% filter(!is.nan(values)) %>% arrange(desc(values))

best_model <- as.numeric(str_extract(F1$ind[8], '[[:digit:]]{1,6}$')) # 8
conf.overview[[best_model]]
names(conf.overview)[[best_model]]
saveRDS(All_models[[best_model]],"~/lib/best_model.R")
saveRDS(All_models[[best_model]],"C:/Users/xwg574/Dropbox/Papers/Talk of the Home Town/models/best_model.R")
